# Bootstrap pacman (install if absent), then use it to load knitr.
# requireNamespace() checks availability without attaching; all later pacman
# use is fully qualified (pacman::p_load), so attaching is unnecessary.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
## Loading required package: pacman
pacman::p_load("knitr")
knitr::opts_knit$set(root.dir = "~/source/big_data/Group6/datasets")
# NOTE(review): root.dir already points at the datasets folder, yet the reads
# below use a "../datasets/" prefix — confirm which working directory is in
# effect when this chunk runs (opts_knit$set applies from the next chunk on).
df.train <- read.csv("../datasets/NS.TRAIN.csv")
df.test <- read.csv("../datasets/NS.TEST.csv")
Importing the data frames
# Dimensions of the training set: 88,416 rows x 35 columns
dim(df.train)
## [1] 88416 35
# Missing-value check over the whole frame; only FALSE appears, i.e. no NAs
table(is.na(df.train))
##
## FALSE
## 3094560
# Per-column distribution summary of the training set
summary(df.train)
## patient_id appointment week_day schedule_date
## Min. :3.920e+04 Min. :5030230 Min. :1.000 2016-05-03: 3389
## 1st Qu.:4.156e+12 1st Qu.:5640302 1st Qu.:2.000 2016-05-02: 3364
## Median :3.168e+13 Median :5680628 Median :3.000 2016-05-16: 3311
## Mean :1.470e+14 Mean :5675348 Mean :2.857 2016-05-05: 3285
## 3rd Qu.:9.433e+13 3rd Qu.:5725620 3rd Qu.:4.000 2016-05-10: 3179
## Max. :9.999e+14 Max. :5790484 Max. :6.000 2016-05-09: 3152
## (Other) :68736
## appointmnet_date waiting_time age is_female
## 2016-06-06: 3736 Min. : 0.00 Min. : 0.00 Min. :0.0000
## 2016-05-16: 3693 1st Qu.: 0.00 1st Qu.: 18.00 1st Qu.:0.0000
## 2016-05-30: 3636 Median : 4.00 Median : 37.00 Median :1.0000
## 2016-06-07: 3589 Mean : 10.18 Mean : 37.13 Mean :0.6502
## 2016-06-08: 3586 3rd Qu.: 15.00 3rd Qu.: 55.00 3rd Qu.:1.0000
## 2016-05-11: 3580 Max. :179.00 Max. :115.00 Max. :1.0000
## (Other) :66596
## scholarship neighbourhood region poverty
## Min. :0.00000 Jardim Camburi : 6183 Min. :1.000 Min. :0.000
## 1st Qu.:0.00000 Maria Ortiz : 4635 1st Qu.:3.000 1st Qu.:2.000
## Median :0.00000 Resistencia : 3517 Median :4.000 Median :4.000
## Mean :0.09842 Jardim da Penha: 3105 Mean :4.427 Mean :3.184
## 3rd Qu.:0.00000 Itarare : 2797 3rd Qu.:6.000 3rd Qu.:4.000
## Max. :1.00000 Centro : 2695 Max. :9.000 Max. :5.000
## (Other) :65484
## x_coor y_coor hipertension diabetes
## Min. :-20.51 Min. :-40.84 Min. :0.0000 Min. :0.00000
## 1st Qu.:-20.31 1st Qu.:-40.33 1st Qu.:0.0000 1st Qu.:0.00000
## Median :-20.30 Median :-40.31 Median :0.0000 Median :0.00000
## Mean :-20.16 Mean :-40.30 Mean :0.1975 Mean :0.07185
## 3rd Qu.:-20.27 3rd Qu.:-40.30 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :-14.86 Max. :-29.33 Max. :1.0000 Max. :1.00000
##
## alcoholism handcap sms_recieved same_day
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. : 1.000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.: 1.000
## Median :0.00000 Median :0.00000 Median :0.0000 Median : 1.000
## Mean :0.03019 Mean :0.02264 Mean :0.3216 Mean : 1.187
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.: 1.000
## Max. :1.00000 Max. :4.00000 Max. :1.0000 Max. :10.000
##
## week_before ever_before known_week_after known_ever_after
## Min. : 0.0000 Min. : 0.000 Min. :0 Min. :0
## 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.:0 1st Qu.:0
## Median : 0.0000 Median : 0.000 Median :0 Median :0
## Mean : 0.4039 Mean : 1.187 Mean :0 Mean :0
## 3rd Qu.: 0.0000 3rd Qu.: 1.000 3rd Qu.:0 3rd Qu.:0
## Max. :21.0000 Max. :83.000 Max. :0 Max. :0
##
## all_known_appointments no_show region_1
## Min. :0 Min. :0.0000 Min. :0.00000
## 1st Qu.:0 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0 Median :0.0000 Median :0.00000
## Mean :0 Mean :0.2024 Mean :0.08092
## 3rd Qu.:0 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :0 Max. :1.0000 Max. :1.00000
##
## region_2 region_3 region_4 region_5
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.1423 Mean :0.1359 Mean :0.2288 Mean :0.03698
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
##
## region_6 region_7 region_8 region_9
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.00e+00
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00e+00
## Median :0.0000 Median :0.0000 Median :0.00000 Median :0.00e+00
## Mean :0.1418 Mean :0.1633 Mean :0.06993 Mean :2.26e-05
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00e+00
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.00e+00
##
Display the summary and define a business target
TBD ##### Predictor columns explained * patient_id - Patient ID * appointment - Appointment ID * week_day - Numeric week day (1-6) * schedule_date - Schedule date * appointmnet_date - Appointment date (column name keeps the source typo) * waiting_time - Difference between schedule and appointment dates * age - Age of patient * is_female - Sex (boolean) * scholarship - Scholarship received (boolean) * neighbourhood - Neighbourhood * hipertension * diabetes * alcoholism * handcap * sms_recieved * poverty - Neighbourhood poverty rank * x_coor - Latitude (values around -20.3) * y_coor - Longitude (values around -40.3) * region - Neighbourhood numeric region * no_show - Missed the appointment (boolean)
| numeric | categorical |
|---|---|
| patient_id | schedule_date |
| appointment | appointmnet_date |
| week_day | neighbourhood |
| waiting_time | poverty |
| age | - |
| is_female | - |
| scholarship | - |
| - | - |
# Count missing values per column. vapply() pins the return type to one
# integer per column (sapply's return type varies with its input, which is
# fragile in non-interactive code); all counts are zero, matching the
# table(is.na(...)) check above.
vapply(df.train, function(x) sum(is.na(x)), integer(1))
## patient_id appointment week_day
## 0 0 0
## schedule_date appointmnet_date waiting_time
## 0 0 0
## age is_female scholarship
## 0 0 0
## neighbourhood region poverty
## 0 0 0
## x_coor y_coor hipertension
## 0 0 0
## diabetes alcoholism handcap
## 0 0 0
## sms_recieved same_day week_before
## 0 0 0
## ever_before known_week_after known_ever_after
## 0 0 0
## all_known_appointments no_show region_1
## 0 0 0
## region_2 region_3 region_4
## 0 0 0
## region_5 region_6 region_7
## 0 0 0
## region_8 region_9
## 0 0
number of missing values per column
Column to predict is no_show
Since the target column no_show is a binary (0/1) categorical variable, this is a classification problem
# Drop rows with missing values before plotting (the NA check above shows
# there are none, so this is a no-op safeguard). Use <- for assignment.
df <- na.omit(df.train)
# 2x4 grid of distribution histograms; skewed counts get log/sqrt transforms
par(mfrow = c(2, 4))
hist(df$week_day, main = "week_day", breaks = 10)
hist(log(df$waiting_time + 1), main = "log waiting_time", breaks = 10)
hist(sqrt(df$age), main = "sqrt age", breaks = 10)
hist(df$poverty, main = "poverty", breaks = 10)
hist(df$region, main = "region", breaks = 10)
hist(log(df$same_day + 1), main = "log same_day", breaks = 10)
hist(log(df$week_before + 1), main = "log week_before", breaks = 10)
hist(log(df$ever_before + 1), main = "log ever_before", breaks = 10)
# Mean no-show rate as a function of log(waiting_time + 1)
eda_waiting_time <- aggregate(df.train$no_show, by = list(log(df.train$waiting_time + 1)), FUN = mean)
names(eda_waiting_time) <- c("waiting time", "No show prob")
# Fix: after the rename above, column `x` no longer exists, so the original
# second argument eda_waiting_time$x was NULL; plotting the two-column data
# frame alone gives the intended scatter.
plot(eda_waiting_time, type = "p", main = "No show prob vs. log waiting time")
We can identify a group of confident shows around a waiting time of 100 days (about 4.5 on the log scale), whereas the majority sits around a no-show probability of 0.3
# Mean no-show rate by week day
eda_week_day <- aggregate(df.train$no_show, by = list(df.train$week_day), FUN = mean)
plot(eda_week_day, type = "l", main = "No show prob vs. week_day")
# Mean no-show rate by sqrt(age); the original repeated this exact plot twice,
# the duplicate has been removed
eda_age <- aggregate(df.train$no_show, by = list(sqrt(df.train$age)), FUN = mean)
plot(eda_age, type = "p", main = "No show prob vs. sqrt age")
# Mean no-show rate by raw age, for comparison with the sqrt version
eda_age <- aggregate(df.train$no_show, by = list(df.train$age), FUN = mean)
plot(eda_age, type = "p", main = "No show prob vs. age")
# Mean no-show rate by scholarship (binary)
eda_scholarship <- aggregate(df.train$no_show, by = list(df.train$scholarship), FUN = mean)
plot(eda_scholarship, type = "h", col = c("light blue", "blue"), lwd = 100, ylim = c(0.1, 0.4), main = "No show prob vs. scholarship")
# Mean no-show rate by sms_recieved (binary; column name keeps the data's typo)
eda_sms_recieved <- aggregate(df.train$no_show, by = list(df.train$sms_recieved), FUN = mean)
plot(eda_sms_recieved, type = "h", col = c("light blue", "blue"), lwd = 100, ylim = c(0.1, 0.4), main = "No show prob vs. sms_recieved")
#### Low probability graphs
# Helper: plot the mean no-show rate grouped by one vector, on a fixed [0, 1]
# scale so the eight panels below are directly comparable.
plot_noshow_rate <- function(group, label) {
  rate <- aggregate(df.train$no_show, by = list(group), FUN = mean)
  plot(rate, type = "l", main = paste("No show prob vs.", label), ylim = c(0, 1))
}
par(mfrow = c(2, 4))
plot_noshow_rate(df.train$is_female, "is_female")
plot_noshow_rate(df.train$poverty, "poverty")
plot_noshow_rate(df.train$diabetes, "diabetes")
plot_noshow_rate(df.train$hipertension, "hipertension")
plot_noshow_rate(df.train$handcap, "handcap")
plot_noshow_rate(df.train$same_day, "same_day")
plot_noshow_rate(df.train$week_before, "week_before")
plot_noshow_rate(df.train$ever_before, "ever_before")
# Merge conflict resolved to the HEAD variant: skew-correct the two heavy-
# tailed predictors (log for waiting_time, sqrt for age — see histograms
# above), then fit the logistic regression. The retained output (deviance
# 81699 on 88410 df, AIC 81711) matches this 6-coefficient model.
df.train$log_waiting_time <- log(df.train$waiting_time + 1)
df.train$sqrt_age <- sqrt(df.train$age)
LM
logit_model <- glm(no_show ~ sqrt_age +
                     log_waiting_time +
                     scholarship +
                     sms_recieved +
                     region, data = df.train, family = binomial)
summary(logit_model)
##
## Call:
## glm(formula = no_show ~ sqrt_age + log_waiting_time + scholarship +
## sms_recieved + region, family = binomial, data = df.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4793 -0.7161 -0.4799 -0.3951 2.3429
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.927551 0.031864 -60.494 < 2e-16 ***
## sqrt_age -0.060380 0.003797 -15.903 < 2e-16 ***
## log_waiting_time 0.562168 0.007544 74.521 < 2e-16 ***
## scholarship 0.272104 0.028144 9.668 < 2e-16 ***
## sms_recieved -0.173436 0.020176 -8.596 < 2e-16 ***
## region -0.022630 0.004172 -5.424 5.84e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 89074 on 88415 degrees of freedom
## Residual deviance: 81699 on 88410 degrees of freedom
## AIC: 81711
##
## Number of Fisher Scoring iterations: 4
plot(logit_model)




CART
pacman::p_load("tree")
# Merge conflict resolved to the HEAD variant: the regression tree uses the
# same transformed predictors (log_waiting_time, sqrt_age) as the logit model;
# the retained summary output below matches that formula.
noshow.CART <- tree(no_show ~ week_day +
  log_waiting_time +
  sqrt_age +
  is_female +
  scholarship +
  hipertension +
  diabetes +
  alcoholism +
  handcap +
  sms_recieved +
  poverty +
  region, data = df.train)
plot(noshow.CART)
text(noshow.CART, pretty = 0, cex = 0.5)

summary(noshow.CART)
##
## Regression tree:
## tree(formula = no_show ~ week_day + log_waiting_time + sqrt_age +
## is_female + scholarship + hipertension + diabetes + alcoholism +
## handcap + sms_recieved + poverty + region, data = df.train)
## Variables actually used in tree construction:
## [1] "log_waiting_time" "sqrt_age"
## Number of terminal nodes: 3
## Residual mean deviance: 0.1466 = 12960 / 88410
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.33060 -0.33060 -0.04566 0.00000 -0.04566 0.95430
RF
pacman::p_load("randomForest")
set.seed(7)
# Merge conflict resolved to the outer HEAD variant (ntree = 100).
# NOTE(review): no_show is numeric 0/1, so randomForest fits a REGRESSION
# forest regardless of type = "classification" — the warning below confirms
# this. For true classification the response would need to be a factor;
# verify which behaviour is intended.
noshow.RF <- randomForest(no_show ~ week_day +
  waiting_time +
  age +
  is_female +
  scholarship +
  sms_recieved +
  poverty +
  region,
  data = df.train, na.action = na.omit, type = "classification", ntree = 100)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
plot(noshow.RF)

#importance(noshow.RF)
varImpPlot(noshow.RF)

GBM
# install.packages("gbm",repos = "http://cran.us.r-project.org")
#library("gbm")
pacman::p_load("gbm")
set.seed(7) # same seed to repeat the RF
# Merge conflict resolved to the HEAD variant: the boosted model uses the
# transformed predictors; the retained call echo and relative-influence table
# below match that formula.
no_show.gbm <- gbm(no_show ~ week_day +
  log_waiting_time +
  sqrt_age +
  is_female +
  scholarship +
  sms_recieved +
  poverty +
  region, data = df.train, n.trees = 1000, interaction.depth = 4, shrinkage = 0.2, verbose = F)
## Distribution not specified, assuming bernoulli ...
no_show.gbm
## gbm(formula = no_show ~ week_day + log_waiting_time + sqrt_age +
## is_female + scholarship + sms_recieved + poverty + region,
## data = df.train, n.trees = 1000, interaction.depth = 4, shrinkage = 0.2,
## verbose = F)
## A gradient boosted model with bernoulli loss function.
## 1000 iterations were performed.
## There were 8 predictors of which 8 had non-zero influence.
summary(no_show.gbm)

## var rel.inf
## log_waiting_time log_waiting_time 56.837348
## sqrt_age sqrt_age 22.970307
## region region 5.406216
## poverty poverty 4.646453
## week_day week_day 4.380362
## sms_recieved sms_recieved 2.836217
## is_female is_female 1.653932
## scholarship scholarship 1.269164
Model evaluation
Logistic Model
# Merge conflict resolved: the retained logit model was trained on
# log_waiting_time and sqrt_age, so the SAME transforms must be applied to
# the test set before predicting (the bare-HEAD branch omitted them, which
# would make predict() fail on missing columns).
df.test$log_waiting_time <- log(df.test$waiting_time + 1)
df.test$sqrt_age <- sqrt(df.test$age)
threshold <- 0.6
# Predicted no-show probabilities, thresholded into a 0/1 prediction
fitted.lm.results <- predict(logit_model, df.test, type = 'response')
lm.prediction <- ifelse(fitted.lm.results > threshold, 1, 0)
lm.accuracy <- mean(lm.prediction == df.test$no_show)
lm.accuracy
## [1] 0.7996381
#install.packages("caret",repos = "http://cran.us.r-project.org")
#library("caret")
#install.packages("e1071",repos = "http://cran.us.r-project.org")
#library("e1071")
pacman::p_load("caret")
pacman::p_load("e1071")
# confusionMatrix requires factors with matching levels; wrapping both sides
# in factor(..., levels = c(0, 1)) keeps this working even when the model
# predicts only one class. Merge conflict in the output resolved to the outer
# HEAD figures (17660/4402/27/16), which are consistent with accuracy 0.7996.
confusionMatrix(data = factor(lm.prediction, levels = c(0, 1)),
                reference = factor(df.test$no_show, levels = c(0, 1)))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 17660 4402
## 1 27 16
##
## Accuracy : 0.7996
## 95% CI : (0.7943, 0.8049)
## No Information Rate : 0.8001
## P-Value [Acc > NIR] : 0.5773
##
## Kappa : 0.0033
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.998473
## Specificity : 0.003622
## Pos Pred Value : 0.800471
## Neg Pred Value : 0.372093
## Prevalence : 0.800136
## Detection Rate : 0.798914
## Detection Prevalence : 0.998055
## Balanced Accuracy : 0.501048
##
## 'Positive' Class : 0
##
cross.table <- table(lm.prediction, df.test$no_show)
# Guard: pad the table to two rows when the model predicted only one class
l <- nrow(cross.table)
if (l < 2) {
  cross.table <- rbind(cross.table, c(0, 0))
}
# Standard binary-classification metrics from the 2x2 cross table
accuracy <- (cross.table[1, 1] + cross.table[2, 2]) / (cross.table[1, 1] + cross.table[2, 2] + cross.table[1, 2] + cross.table[2, 1])
precision <- cross.table[2, 2] / (cross.table[2, 2] + cross.table[2, 1])
recall <- cross.table[2, 2] / (cross.table[2, 2] + cross.table[1, 2])
f1 <- 2 * (recall * precision) / (recall + precision)
paste("Accuracy -", accuracy)
## [1] "Accuracy - 0.799638090929654"
paste("Precision -", precision)
## [1] "Precision - 0.372093023255814"
paste("Recall -", recall)
## [1] "Recall - 0.00362154821186057"
paste("F1 -", f1)
## [1] "F1 - 0.00717327953373683"
#install.packages("pROC",repos = "http://cran.us.r-project.org")
pacman::p_load("pROC")
plot(roc(df.test$no_show, fitted.lm.results, direction="<"), col="blue", main="Left ROC curve")

CART
threshold <- 0.3
# CART evaluation; merge conflict resolved to the HEAD branch (threshold 0.3,
# metrics including F1).
fitted.cart.results <- predict(noshow.CART, df.test)
summary(fitted.cart.results)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.04566 0.04566 0.22663 0.20236 0.33062 0.33062
cart.prediction <- ifelse(fitted.cart.results > threshold, 1, 0)
summary(cart.prediction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3766 1.0000 1.0000
cross.table <- table(cart.prediction, df.test$no_show)
# Guard: pad the table to two rows when only one class was predicted
l <- nrow(cross.table)
if (l < 2) {
  cross.table <- rbind(cross.table, c(0, 0))
}
accuracy <- (cross.table[1, 1] + cross.table[2, 2]) / (cross.table[1, 1] + cross.table[2, 2] + cross.table[1, 2] + cross.table[2, 1])
precision <- cross.table[2, 2] / (cross.table[2, 2] + cross.table[2, 1])
recall <- cross.table[2, 2] / (cross.table[2, 2] + cross.table[1, 2])
f1 <- 2 * (recall * precision) / (recall + precision)
paste("Accuracy -", accuracy)
## [1] "Accuracy - 0.663605519113323"
paste("Precision -", precision)
## [1] "Precision - 0.318716962998558"
paste("Recall -", recall)
## [1] "Recall - 0.600497962879131"
paste("F1 -", f1)
## [1] "F1 - 0.416418144718255"
RF
# RF evaluation; merge conflicts resolved to the shared threshold-0.5 pass
# with the HEAD metrics block (including F1).
threshold <- 0.5
fitted.rf.results <- predict(noshow.RF, df.test)
summary(fitted.rf.results)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02125 0.05853 0.21249 0.20366 0.30583 0.69628
rf.prediction <- ifelse(fitted.rf.results > threshold, 1, 0)
summary(rf.prediction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.00846 0.00000 1.00000
cross.table <- table(rf.prediction, df.test$no_show)
# Guard: pad the table to two rows when only one class was predicted
l <- nrow(cross.table)
if (l < 2) {
  cross.table <- rbind(cross.table, c(0, 0))
}
accuracy <- (cross.table[1, 1] + cross.table[2, 2]) / (cross.table[1, 1] + cross.table[2, 2] + cross.table[1, 2] + cross.table[2, 1])
precision <- cross.table[2, 2] / (cross.table[2, 2] + cross.table[2, 1])
recall <- cross.table[2, 2] / (cross.table[2, 2] + cross.table[1, 2])
f1 <- 2 * (recall * precision) / (recall + precision)
paste("Accuracy -", accuracy)
## [1] "Accuracy - 0.801266681746211"
paste("Precision -", precision)
## [1] "Precision - 0.566844919786096"
paste("Recall -", recall)
## [1] "Recall - 0.0239927569035763"
paste("F1 -", f1)
## [1] "F1 - 0.0460369163952226"
GBM
threshold <- 0.6
# GBM evaluation; merge conflicts resolved to the HEAD branch (threshold 0.6,
# metrics including F1).
# NOTE(review): the summary below ranges from -5.06 to 1.70, so these
# predictions are on the link (log-odds) scale, not probabilities; a 0.6
# cut-off is therefore not a probability threshold. Verify intent —
# predict(..., type = "response") would yield probabilities.
fitted.gbm.results <- predict(no_show.gbm, df.test, n.trees = 1000)
summary(fitted.gbm.results)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -5.0588 -3.0654 -1.3217 -1.7214 -0.8148 1.7028
gbm.prediction <- ifelse(fitted.gbm.results > threshold, 1, 0)
summary(gbm.prediction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.00294 0.00000 1.00000
cross.table <- table(gbm.prediction, df.test$no_show)
# Guard: pad the table to two rows when only one class was predicted
l <- nrow(cross.table)
if (l < 2) {
  cross.table <- rbind(cross.table, c(0, 0))
}
accuracy <- (cross.table[1, 1] + cross.table[2, 2]) / (cross.table[1, 1] + cross.table[2, 2] + cross.table[1, 2] + cross.table[2, 1])
precision <- cross.table[2, 2] / (cross.table[2, 2] + cross.table[2, 1])
recall <- cross.table[2, 2] / (cross.table[2, 2] + cross.table[1, 2])
f1 <- 2 * (recall * precision) / (recall + precision)
paste("Accuracy -", accuracy)
## [1] "Accuracy - 0.800452386337933"
paste("Precision -", precision)
## [1] "Precision - 0.553846153846154"
paste("Recall -", recall)
## [1] "Recall - 0.00814848347668628"
paste("F1 -", f1)
## [1] "F1 - 0.0160606736560339"